import pandas as pd
# Load the raw sighting reports from the CSV file
ufo_data = pd.read_csv('nuforc_reports.csv')
# Quick inspection (these bare expressions only show output when run
# interactively): first/last rows, missing values per column, dimensions
ufo_data.head(n=10)
ufo_data.tail(n=10)
ufo_data.isna().sum()
ufo_data.shape
# Keep only the columns used in the rest of the analysis
df1 = ufo_data.loc[:, ['city', 'state', 'date_time', 'shape', 'text']]
df1.head(n=5)
# Discard every row that has at least one missing value, then renumber
# the index from zero
df1 = df1.dropna(how='any').reset_index(drop=True)
df1.shape
# Fixing an abbreviation duplication issue: 'QB' is rewritten to 'QC'
# (vectorized replace instead of a row-by-row lambda)
df1['state'] = df1['state'].replace('QB', 'QC')
# Two-letter codes of the Canadian provinces and territories.
# 'NU' (Nunavut) was absent from the original list, which would have caused
# any Nunavut rows to be labelled 'USA' when `country` is derived below.
canada = ['ON', 'QC', 'AB', 'BC', 'NB', 'MB',
          'NS', 'SK', 'NT', 'NL', 'YT', 'PE', 'NU']
# Derive four columns: `country`, `year`, `month`, and `time`.
# A state code found in `canada` means Canada; everything else is USA.
df1['country'] = df1['state'].isin(canada).map({True: 'Canada',
                                                False: 'USA'})
# `date_time` is sliced as text: characters 0-3 are taken as the year and
# characters 5-6 as the month number
# (assumes an ISO-like 'YYYY-MM-DD...' layout - TODO confirm against the CSV)
df1['year'] = df1['date_time'].str[:4].astype(int)
# Month numbers 1-12 are swapped for three-letter abbreviations
df1['month'] = df1['date_time'].str[5:7].astype(int).replace(
    dict(zip(range(1, 13),
             ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])))
# The two characters starting eight from the end are stored as `time`
# (presumably the hour of a trailing 'HH:MM:SS' - TODO confirm)
df1['time'] = df1['date_time'].str[-8:-6].astype(int)
# `date_time` has already been split into year/month/time, so remove it,
# then discard exact duplicate rows and renumber the index from zero
df1 = (
    df1.drop(columns=['date_time'])
    .drop_duplicates()
    .reset_index(drop=True)
)
df1.head(n=20)
# Share of sightings per country, rounded to whole percent
(df1['country'].value_counts(normalize=True) * 100).round()
import matplotlib.pyplot as plt
import seaborn as sns
# Share of sightings that fall in each month (in %), arranged in calendar
# order rather than by frequency
months = (
    df1['month']
    .value_counts(normalize=True)
    .loc[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]
    .mul(100)
)
# Defining a function for creating and customizing a figure in matplotlib
def create_customized_fig(title='UFO occurrences by month, %', ylim=(0, 15)):
    """Create a pre-styled 12x6 figure that later plotting calls draw onto.

    The new axes become matplotlib's current axes, so a following
    ``plt.<plot>()`` call is rendered on them.

    Args:
        title: Text shown above the axes. The default is the monthly
            chart title (with the original "occurences" typo corrected).
        ylim: ``(bottom, top)`` limits of the y axis.

    Returns:
        A single-space string, exactly as before.
        NOTE(review): the only caller visible here ignores the value; it is
        kept unchanged for backward compatibility.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    ax.set_title(title, fontsize=27)
    ax.set_ylim(*ylim)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    # Hide the tick marks on the bottom axis
    ax.tick_params(bottom=False)
    # seaborn's default despine removes the top and right spines
    sns.despine()
    return ' '
# PLOTTING
# Set up the pre-styled figure; its axes become the current axes, so the
# stem plot below is drawn onto them
create_customized_fig()
# Creating a stem plot: one stem per month, stem height = share of sightings
plt.stem(months.index, months)
plt.show()
# Relative frequency of every reported shape (in %), sorted from the
# rarest to the most common
shapes = df1['shape'].value_counts(normalize=True, ascending=True).mul(100)
fig, ax = plt.subplots(figsize=(12, 9))
# Horizontal stem ("lollipop") plot: a dotted line from 0 to each value...
ax.hlines(
    y=shapes.index,
    xmin=0,
    xmax=shapes,
    color='slateblue',
    linestyle='dotted',
    linewidth=5,
)
# ...with a star marker at the end of every line
ax.plot(shapes, shapes.index, '*', ms=17, c='darkorange')
ax.set_title('UFO shapes by sighting frequency, %', fontsize=29)
ax.set_xlim(0, 25)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
ax.tick_params()
sns.despine()
plt.show()
from wordcloud import WordCloud, STOPWORDS
# Gathering sighting descriptions from all American witnesses.
# str.join builds the corpus in a single pass; growing a string with `+=`
# inside a loop may copy the whole text on every iteration.
text = ' '.join(df1.loc[df1['country'] == 'USA', 'text'])
fig = plt.subplots(figsize=(10, 10))
# Creating a basic word cloud (collocations=False: no two-word phrases)
wordcloud = WordCloud(
    width=1000,
    height=1000,
    collocations=False,
).generate(text)
plt.title('USA collective description of UFO', fontsize=27)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Saving the word cloud
wordcloud.to_file('wordcloud_usa.png')
# Gathering sighting descriptions from all Canadian witnesses.
# str.join builds the corpus in a single pass; growing a string with `+=`
# inside a loop may copy the whole text on every iteration.
text = ' '.join(df1.loc[df1['country'] == 'Canada', 'text'])
# Creating a user stopword list
stopwords = ['one', 'two', 'first', 'second', 'saw', 'see', 'seen',
             'looked', 'looking', 'look', 'went', 'minute', 'back',
             'noticed', 'north', 'south', 'east', 'west', 'nuforc',
             'appeared', 'shape', 'side', 'witness', 'sighting',
             'going', 'note', 'around', 'direction', 'approximately',
             'still', 'away', 'across', 'seemed', 'time']
fig = plt.subplots(figsize=(10, 10))
# Creating and customizing a word cloud.
# `STOPWORDS.update(...)` returns None and mutates the library's shared set
# in place, so the original call effectively passed `stopwords=None`.
# `union` returns a new combined set and leaves STOPWORDS untouched.
wordcloud = WordCloud(
    width=1000,
    height=1000,
    collocations=False,
    colormap='cool',
    background_color='yellow',
    stopwords=STOPWORDS.union(stopwords),
    prefer_horizontal=0.85,
    random_state=100,
    max_words=100,
    min_word_length=3,
).generate(text)
plt.title('Canadian collective description of UFO', fontsize=27)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Saving the word cloud
wordcloud.to_file('wordcloud_canada.png')
# Requires the third-party `squarify` package; install it beforehand with:
#     pip install squarify
# (a bare `pip install ...` line is not valid Python, so it is kept here
# as a comment only)
import squarify
# Extract the data: number of sightings per US state
states = df1.loc[df1['country'] == 'USA', 'state'].value_counts()
fig = plt.subplots(figsize=(12, 6))
# Creating a treemap: one rectangle per state, area given by its count
squarify.plot(sizes=states.values, label=states.index)
plt.title('UFO sighting frequencies by state, the USA', fontsize=27)
plt.axis('off')
plt.show()
import matplotlib
# Number of sightings for each distinct value of `time`
hours = df1['time'].value_counts()
# Build a list of colors drawn from two matplotlib colormaps,
# `Set3` and `tab20`
cmap1 = matplotlib.cm.Set3
cmap2 = matplotlib.cm.tab20
colors = []
for i in range(hours.size):
    # Always take the `Set3` color for this position...
    colors.append(cmap1(i))
    # ...and the `tab20` color too, unless it is already in the list
    if cmap2(i) in colors:
        continue
    colors.append(cmap2(i))
fig = plt.subplots(figsize=(12, 6))
# Treemap with the custom colors, padded rectangles and bold labels
squarify.plot(
    sizes=hours.values,
    label=hours.index,
    color=colors,
    alpha=0.8,
    pad=True,
    text_kwargs=dict(color='indigo', fontsize=20, fontweight='bold'),
)
plt.title('UFO sighting frequencies by hour', fontsize=27)
plt.axis('off')
plt.show()
# Requires the third-party `matplotlib-venn` package; install it beforehand
# with:
#     pip install matplotlib-venn
# (a bare `pip install ...` line is not valid Python, so it is kept here
# as a comment only)
# NOTE(review): the star import is left as is because names other than
# venn2 / venn3 / venn3_circles may be used outside the code visible here.
from matplotlib_venn import *
# Creating the subsets for crosses and cigars: cities of sightings with the
# given shape reported in 2015-2019 (both years included)
crosses = df1.loc[
    (df1['shape'] == 'cross') & df1['year'].between(2015, 2019),
    'city',
]
cigars = df1.loc[
    (df1['shape'] == 'cigar') & df1['year'].between(2015, 2019),
    'city',
]
fig = plt.subplots(figsize=(12, 8))
# Creating a Venn diagram of the distinct cities in each subset
venn2(subsets=[set(crosses), set(cigars)],
      set_labels=['Crosses', 'Cigars'])
plt.title('Crosses and cigars by number of cities, 2015-2019',
          fontsize=27)
plt.show()
# Cities of diamond-shaped sightings reported in 2015-2019
# (both years included)
diamonds = df1.loc[
    (df1['shape'] == 'diamond') & df1['year'].between(2015, 2019),
    'city',
]
# Distinct cities per shape, in the order crosses, cigars, diamonds
subsets = [set(crosses), set(cigars), set(diamonds)]
fig = plt.subplots(figsize=(15, 10))
# Three-set Venn diagram with custom fill colors
venn3(
    subsets=subsets,
    set_labels=['Crosses', 'Cigars', 'Diamonds'],
    set_colors=['magenta', 'dodgerblue', 'gold'],
    alpha=0.3,
)
# Dotted outlines drawn over the circles
venn3_circles(
    subsets=subsets,
    color='darkviolet',
    alpha=0.9,
    ls='dotted',
    lw=4,
)
plt.title('Crosses, cigars, and diamonds \nby number of cities, 2015-2019',
          fontsize=26)
plt.show()
# Cities that occur in all three subsets
print(subsets[0] & subsets[1] & subsets[2])
# The three Canadian state codes with the most sightings
df1.loc[df1['country'] == 'Canada', 'state'].value_counts().head(3)
# Cylinder- and cone-shaped sightings from California and Ontario
CA_ON_cyl_con = df1[
    df1['state'].isin(['CA', 'ON'])
    & df1['shape'].isin(['cylinder', 'cone'])
]
fig = plt.subplots(figsize=(12, 7))
sns.set(style='white')
# Swarm plot: one point per sighting, year on the x axis, state on the y axis
sns.swarmplot(
    x='year',
    y='state',
    data=CA_ON_cyl_con,
    palette=['deeppink', 'blue'],
)
# Box plots drawn on the same axes
sns.boxplot(
    x='year',
    y='state',
    data=CA_ON_cyl_con,
    palette=['palegreen', 'lemonchiffon'],
)
plt.title('Cylinders and cones in California and Ontario', fontsize=29)
plt.xlabel('Years', fontsize=18)
plt.ylabel('States', fontsize=18)
sns.despine()
plt.show()
fig = plt.subplots(figsize=(12, 7))
# Same swarm plot as before, now with the points colored by `shape`
sns.swarmplot(
    x='year',
    y='state',
    hue='shape',
    data=CA_ON_cyl_con,
    palette=['deeppink', 'blue'],
)
# Box plots drawn on the same axes
sns.boxplot(
    x='year',
    y='state',
    data=CA_ON_cyl_con,
    palette=['palegreen', 'lemonchiffon'],
)
plt.title('Cylinders and cones in California and Ontario', fontsize=29)
# Restrict the x axis to the range 1997-2020
plt.xlim(1997, 2020)
plt.xlabel('Years', fontsize=18)
plt.ylabel('States', fontsize=18)
plt.legend(loc='upper left', frameon=False, fontsize=15)
sns.despine()
plt.show()
# Requires the third-party `folium` package; install it beforehand with:
#     pip install folium
# (a bare `pip install ...` line is not valid Python, so it is kept here
# as a comment only)
import folium
from folium.plugins import HeatMap
# Keep only reports that have BOTH coordinates. The original filtered on
# latitude alone, so a row with a missing longitude would still have been
# passed to HeatMap.
df_plot = ufo_data.dropna(subset=['city_latitude', 'city_longitude'])
df_plot.shape
# Heatmap of locations for a first overview
zoom_factor = 2  # initial zoom level of the map
my_map_1 = folium.Map(location=[0, 0], zoom_start=zoom_factor)
HeatMap(data=df_plot[['city_latitude', 'city_longitude']],
        radius=10).add_to(my_map_1)
my_map_1  # display (only has an effect when run interactively)
# Line chart of the number of rows per year (count of `state` values)
df1.pivot_table(
    index='year',
    values='state',
    aggfunc='count',
).plot(figsize=(10, 6))
plt.title('1970 - 2019')